from pyspark import SparkConf
from pyspark import SparkContext
# V1.0: count occurrences of only the words you specify
if __name__ == '__main__':
    # Word count restricted to a whitelist of target words.
    # Reads a comma-separated text file, keeps only whitelisted words,
    # and prints (word, count) pairs.
    conf = SparkConf().setMaster("local").setAppName("wordcount")
    sc = SparkContext(conf=conf)

    try:
        # Words I want to count. A set gives O(1) membership tests and is
        # the structure Spark serializes into the filter closure.
        target_words = {'spark', 'hello'}

        lines = sc.textFile("../data/test1.txt")
        # lines = sc.textFile("hdfs://mynode1:8020/test.txt")

        # Split each line on commas, keep only the target words,
        # then count occurrences per word.
        words = lines.flatMap(lambda line: line.split(",")) \
            .filter(lambda x: x in target_words)
        result = words.map(lambda word: (word, 1)) \
            .reduceByKey(lambda a, b: a + b)
        print(result.collect())
    finally:
        # Always release the SparkContext, even if the job fails.
        sc.stop()
